# Copyright (c) 2019-2021 NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import math

#######################################################################################################################################################################


def unpad_input(out_, in_, indices):
    """Gather selected rows of ``in_`` into ``out_`` in place.

    Row ``i`` of ``out_`` receives row ``indices[i]`` of ``in_``.

    Args:
        out_: Destination, indexed with two dimensions; fully overwritten.
        in_: Source whose first dimension is indexed by ``indices``.
        indices: Row positions to read from ``in_``.

    Returns:
        None. The result is written into ``out_``.
    """
    selected_rows = in_[indices, :]
    out_[:, :] = selected_rows


def pad_input(out_, in_, indices):
    """Scatter the rows of ``in_`` into selected rows of ``out_`` in place.

    Row ``indices[i]`` of ``out_`` receives row ``i`` of ``in_``; rows of
    ``out_`` that are not listed in ``indices`` keep their current values.

    Args:
        out_: Destination whose first dimension is indexed by ``indices``.
        in_: Source, indexed with two dimensions.
        indices: Row positions to write in ``out_``.

    Returns:
        None. The result is written into ``out_``.
    """
    packed_rows = in_[:, :]
    out_[indices, :] = packed_rows


def unpad_mask(out_, in_, indices):
    """Gather selected elements of the flattened ``in_`` into ``out_`` in place.

    Element ``i`` of ``out_`` receives element ``indices[i]`` of
    ``in_.flatten()``.

    Args:
        out_: Destination; fully overwritten.
        in_: Source, flattened before indexing.
        indices: Positions to read from the flattened source.

    Returns:
        None. The result is written into ``out_``.
    """
    flat_source = in_.flatten()
    out_[:] = flat_source[indices]


#######################################################################################################################################################################


def generate_mask(attention_mask,
                  heads,
                  pad=False,
                  fuse_mask=True,
                  unpad_fmha=False):
    """Derive token indices, an additive mask and length metadata.

    Args:
        attention_mask: 2-D ``[batch, seq]`` tensor whose non-zero entries
            mark real tokens. Per-row lengths are computed as the row sum, so
            the mask is presumably 0/1 valued -- confirm against callers.
        heads: Number of attention heads. Only used when ``fuse_mask`` is
            falsy, where the mask is repeated once per head.
        pad: When truthy every sequence is treated as full width
            (``attention_mask.shape[1]``). When falsy each length is rounded
            up to a multiple of 16, with a minimum of 16.
        fuse_mask: When truthy the mask has one value per token; when falsy it
            is built from the per-sequence outer product, repeated per head.
        unpad_fmha: When truthy, return the metadata for the FMHA path
            instead and ignore ``pad`` / ``fuse_mask``.

    Returns:
        A 7-tuple.

        * ``unpad_fmha`` truthy: ``(indices, attention_mask, seqlen, ntokens,
          cu_seqlens, seqlen, maxseqlen)``. ``indices`` are the flat positions
          of the non-zero mask entries, ``seqlen`` is int32, and
          ``cu_seqlens`` is the int32 cumulative sum of ``seqlen`` with a
          leading zero.
        * otherwise: ``(indices, mask, seqlen, ntokens, None, None, None)``.
          ``seqlen`` is an int32 CPU tensor and ``mask`` is a flat float16
          tensor holding ``(1 - m) * -10000.0``.

    NOTE(review): with ``pad`` falsy, a rounded-up length larger than the
    mask width is clamped when the padded mask is built but still counted in
    ``ntokens``, so the fused mask buffer and ``indices`` would differ in
    size. This looks like it assumes the width is a multiple of 16 -- confirm.
    """
    # Every tensor allocated here follows the input's device rather than a
    # hard-coded "cuda", so all returned masks sit next to ``indices``.
    device = attention_mask.device

    if unpad_fmha:
        seqlen = attention_mask.sum(dim=1).to(dtype=torch.int32).flatten()
        indices = torch.nonzero(attention_mask.flatten(),
                                as_tuple=False).flatten()
        maxseqlen = seqlen.max().item()
        b = attention_mask.shape[0]
        # The leading zero makes cu_seqlens[i] the start offset of sequence i
        # and cu_seqlens[-1] the total token count.
        cu_seqlens = torch.zeros(b + 1, device=device, dtype=torch.int32)
        cu_seqlens[1:] = torch.cumsum(seqlen, dim=0)
        ntokens = cu_seqlens[-1].item()
        return (indices, attention_mask, seqlen, ntokens, cu_seqlens, seqlen,
                maxseqlen)

    # Lengths are handled on the CPU; they are returned as a CPU int32 tensor.
    seqlen = attention_mask.sum(dim=1).float().cpu()
    if pad:
        batch = attention_mask.shape[0]
        maxseqlen = attention_mask.shape[1]
        seqlen.fill_(maxseqlen)
        seqlen = seqlen.int()
        ntokens = batch * maxseqlen
    else:
        # Round each length up to the next multiple of 16, at least 16.
        seqlen[:] = ((seqlen[:] + 16 - 1) / 16).floor() * 16
        seqlen[seqlen < 16] = 16
        seqlen = seqlen.int()
        ntokens = seqlen.sum().item()

    # Mark the first seqlen[i] positions of every row as kept, then record
    # the flat positions of all kept entries.
    padded_mask = attention_mask.clone()
    for row, length in enumerate(seqlen.tolist()):
        padded_mask[row, :length] = 1
    indices = torch.nonzero(padded_mask.flatten(), as_tuple=False).flatten()

    if pad:
        if fuse_mask:
            mask = -10000.0 * (1 - attention_mask).half().view(-1)
        else:
            mask = -10000.0 * (
                1 - (attention_mask.unsqueeze(1) * attention_mask.unsqueeze(2))
            ).unsqueeze(1).half().repeat(1, heads, 1, 1).view(-1)
    elif fuse_mask:
        # One value per kept token: real tokens map to 0 and the positions
        # added by rounding up map to -10000.
        mask = torch.zeros([ntokens], device=device, dtype=torch.float16)
        unpad_mask(mask, attention_mask, indices)
        mask = (1 - mask) * -10000.0
    else:
        padded_mask = (padded_mask.unsqueeze(1) *
                       padded_mask.unsqueeze(2)).unsqueeze(1).half().repeat(
                           1, heads, 1, 1)
        indices_mask = torch.nonzero(padded_mask.flatten(),
                                     as_tuple=False).flatten()
        mask = torch.zeros([len(indices_mask)],
                           device=device,
                           dtype=torch.float16)
        # NOTE(review): this gathers padded_mask at its own non-zero
        # positions, so for a 0/1 mask every gathered value is 1 and the
        # result is all zeros. The fused branch gathers from attention_mask
        # instead -- confirm which is intended. Behaviour is left unchanged.
        unpad_mask(mask, padded_mask, indices_mask)
        mask = (1 - mask) * -10000.0

    return indices, mask, seqlen, ntokens, None, None, None


#######################################################################################################################################################################


class PadInput(torch.autograd.Function):
    """Autograd function that scatters packed rows into a zero-padded buffer.

    Forward writes row ``i`` of ``input`` to row ``indices[i]`` of a
    zero-initialised ``[batch * maxseqlen, hidden]`` float16 buffer. Backward
    gathers the same rows of the incoming gradient back into packed layout.
    """

    @staticmethod
    def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
        """Scatter ``input`` into the padded layout.

        Args:
            ctx: Autograd context; stores ``indices``, ``hidden`` and
                ``ntokens`` for the backward pass.
            input: Packed rows to scatter (the name shadows the builtin but
                is kept for interface compatibility).
            indices: Destination row in the padded buffer for each input row.
            batch: Number of sequences.
            maxseqlen: Padded length of each sequence.
            hidden: Size of the second dimension.
            ntokens: Number of packed rows; used as the row count of the
                gradient produced by ``backward``.

        Returns:
            A float16 tensor of shape ``[batch * maxseqlen, hidden]`` on the
            device of ``input``; rows not listed in ``indices`` are zero.
        """
        ctx.save_for_backward(indices)
        ctx.hidden = hidden
        ctx.ntokens = ntokens

        padded_rows = batch * maxseqlen
        # Allocate next to the input rather than on a hard-coded "cuda"
        # device so source and destination always share a device.
        output = torch.zeros([padded_rows, hidden],
                             device=input.device,
                             dtype=torch.float16)
        pad_input(output, input, indices)

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Gather the gradient rows belonging to the packed input.

        Args:
            ctx: Autograd context populated by ``forward``.
            grad_output: Gradient with respect to the padded output.

        Returns:
            A float16 ``[ntokens, hidden]`` gradient for ``input``, followed
            by ``None`` for each of the five non-differentiable arguments.
        """
        indices, = ctx.saved_tensors

        grad_input = torch.zeros([ctx.ntokens, ctx.hidden],
                                 device=grad_output.device,
                                 dtype=torch.float16)
        unpad_input(grad_input, grad_output, indices)

        return grad_input, None, None, None, None, None


#######################################################################################################################################################################


class UnpadInput(torch.autograd.Function):
    """Autograd function that gathers selected rows into a packed buffer.

    Forward copies row ``indices[i]`` of ``input`` to row ``i`` of a float16
    ``[ntokens, hidden]`` buffer. Backward scatters the incoming gradient back
    into a zero-initialised ``[batch * maxseqlen, hidden]`` buffer.
    """

    @staticmethod
    def forward(ctx, input, indices, batch, maxseqlen, hidden, ntokens):
        """Gather the rows of ``input`` selected by ``indices``.

        Args:
            ctx: Autograd context; stores ``indices``, ``hidden`` and the
                padded row count for the backward pass.
            input: Padded rows to gather from (the name shadows the builtin
                but is kept for interface compatibility).
            indices: Row of ``input`` to read for each output row.
            batch: Number of sequences.
            maxseqlen: Padded length of each sequence.
            hidden: Size of the second dimension.
            ntokens: Number of rows in the packed output.

        Returns:
            A float16 tensor of shape ``[ntokens, hidden]`` on the device of
            ``input``.
        """
        ctx.save_for_backward(indices)
        ctx.hidden = hidden
        # Backward rebuilds the padded layout, so remember its row count.
        ctx.padded_rows = batch * maxseqlen

        # Allocate next to the input rather than on a hard-coded "cuda"
        # device so source and destination always share a device.
        output = torch.zeros([ntokens, hidden],
                             device=input.device,
                             dtype=torch.float16)
        unpad_input(output, input, indices)

        return output

    @staticmethod
    def backward(ctx, grad_output):
        """Scatter the packed gradient back into the padded layout.

        Args:
            ctx: Autograd context populated by ``forward``.
            grad_output: Gradient with respect to the packed output.

        Returns:
            A float16 ``[batch * maxseqlen, hidden]`` gradient for ``input``
            (zero in rows not listed in ``indices``), followed by ``None``
            for each of the five non-differentiable arguments.
        """
        indices, = ctx.saved_tensors

        grad_input = torch.zeros([ctx.padded_rows, ctx.hidden],
                                 device=grad_output.device,
                                 dtype=torch.float16)
        pad_input(grad_input, grad_output, indices)

        return grad_input, None, None, None, None, None


#######################################################################################################################################################################
